import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the training set; the path is relative to the current working directory.
df = pd.read_csv("training_data.csv")
df.shape
(223, 64)
df.dtypes
X01 float64
X02 float64
X03 float64
X04 float64
X05 float64
...
V26 float64
V27 float64
V28 float64
V29 float64
Y int64
Length: 64, dtype: object
df.isna().sum()
X01 0
X02 0
X03 0
X04 0
X05 0
..
V26 0
V27 0
V28 0
V29 0
Y 0
Length: 64, dtype: int64
df.nunique()
X01 223
X02 223
X03 223
X04 223
X05 223
...
V26 223
V27 223
V28 223
V29 223
Y 2
Length: 64, dtype: int64
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 223 entries, 0 to 222 Data columns (total 64 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 X01 223 non-null float64 1 X02 223 non-null float64 2 X03 223 non-null float64 3 X04 223 non-null float64 4 X05 223 non-null float64 5 X06 223 non-null float64 6 X07 223 non-null float64 7 X08 223 non-null float64 8 X09 223 non-null float64 9 X10 223 non-null float64 10 X11 223 non-null float64 11 X12 223 non-null float64 12 X13 223 non-null float64 13 X14 223 non-null float64 14 X15 223 non-null float64 15 X16 223 non-null float64 16 X17 223 non-null float64 17 X18 223 non-null float64 18 X19 223 non-null float64 19 X20 223 non-null float64 20 X21 223 non-null float64 21 X22 223 non-null float64 22 X23 223 non-null float64 23 X24 223 non-null float64 24 X25 223 non-null float64 25 Z01 223 non-null float64 26 Z02 223 non-null float64 27 Z03 223 non-null float64 28 Z04 223 non-null float64 29 Z05 223 non-null float64 30 Z06 223 non-null float64 31 Z07 223 non-null float64 32 Z08 223 non-null float64 33 Z09 223 non-null float64 34 V01 223 non-null float64 35 V02 223 non-null float64 36 V03 223 non-null float64 37 V04 223 non-null float64 38 V05 223 non-null float64 39 V06 223 non-null float64 40 V07 223 non-null float64 41 V08 223 non-null float64 42 V09 223 non-null float64 43 V10 223 non-null float64 44 V11 223 non-null float64 45 V12 223 non-null float64 46 V13 223 non-null float64 47 V14 223 non-null float64 48 V15 223 non-null float64 49 V16 223 non-null float64 50 V17 223 non-null float64 51 V18 223 non-null float64 52 V19 223 non-null float64 53 V20 223 non-null float64 54 V21 223 non-null float64 55 V22 223 non-null float64 56 V23 223 non-null float64 57 V24 223 non-null float64 58 V25 223 non-null float64 59 V26 223 non-null float64 60 V27 223 non-null float64 61 V28 223 non-null float64 62 V29 223 non-null float64 63 Y 223 non-null int64 dtypes: float64(63), int64(1) memory usage: 111.6 KB
I will treat the last column, Y, as categorical because it has only two unique values, and keep all the other columns numeric.
df.Y.value_counts()
1 138 0 85 Name: Y, dtype: int64
df.describe(include='all')
| X01 | X02 | X03 | X04 | X05 | X06 | X07 | X08 | X09 | X10 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | V29 | Y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2.230000e+02 | 2.230000e+02 | 2.230000e+02 | 2.230000e+02 | 2.230000e+02 | 2.230000e+02 | 2.230000e+02 | 2.230000e+02 | 2.230000e+02 | 2.230000e+02 | ... | 223.000000 | 223.000000 | 223.000000 | 223.000000 | 223.000000 | 223.000000 | 223.000000 | 223.000000 | 223.000000 | 223.000000 |
| mean | -6.372581e-17 | -1.035544e-16 | -1.194859e-16 | 6.521938e-17 | -6.970010e-17 | 2.588861e-17 | 4.381149e-17 | 5.974294e-18 | -2.190575e-17 | -2.788004e-17 | ... | 0.022254 | 0.118454 | 0.080438 | 0.113141 | 0.216649 | 0.124892 | 0.949645 | 0.134703 | -86.306579 | 0.618834 |
| std | 1.000479e+01 | 3.432796e+00 | 1.876591e+00 | 1.184369e+00 | 9.268016e-01 | 7.579259e-01 | 7.087896e-01 | 5.524554e-01 | 5.110352e-01 | 4.389306e-01 | ... | 1.511050 | 1.662396 | 1.719626 | 1.804158 | 2.106987 | 1.783030 | 0.037178 | 0.044716 | 46.306093 | 0.486766 |
| min | -2.877510e+01 | -1.048100e+01 | -5.941940e+00 | -3.445607e+00 | -3.593157e+00 | -2.791616e+00 | -2.043792e+00 | -1.363138e+00 | -1.566863e+00 | -1.539497e+00 | ... | -3.778512 | -5.687229 | -6.953736 | -6.790080 | -10.541583 | -5.783808 | 0.788204 | 0.077250 | -182.745288 | 0.000000 |
| 25% | -6.004972e+00 | -2.137978e+00 | -1.174047e+00 | -6.187325e-01 | -5.947222e-01 | -5.259962e-01 | -5.143738e-01 | -3.909297e-01 | -3.147642e-01 | -2.929840e-01 | ... | -1.020518 | -0.831423 | -0.895099 | -0.995807 | -0.965790 | -0.970357 | 0.932600 | 0.108477 | -115.559745 | 0.000000 |
| 50% | 1.597946e-01 | 2.169751e-01 | 8.468241e-02 | 3.652756e-02 | -2.190519e-02 | 1.121122e-02 | -3.799648e-02 | -6.275792e-02 | 1.811046e-02 | -3.987984e-02 | ... | -0.122100 | 0.111196 | 0.016036 | 0.025615 | 0.190205 | 0.032284 | 0.960524 | 0.127374 | -91.517609 | 1.000000 |
| 75% | 6.971508e+00 | 2.203972e+00 | 1.223745e+00 | 7.994909e-01 | 5.912108e-01 | 5.509297e-01 | 4.980837e-01 | 3.620381e-01 | 2.975528e-01 | 2.852263e-01 | ... | 1.002306 | 1.086965 | 1.133833 | 1.174133 | 1.465043 | 1.268731 | 0.976685 | 0.146303 | -63.574713 | 1.000000 |
| max | 2.482622e+01 | 1.193088e+01 | 4.853514e+00 | 3.064266e+00 | 2.411752e+00 | 2.383175e+00 | 1.918046e+00 | 1.886586e+00 | 1.739986e+00 | 1.401598e+00 | ... | 5.958354 | 6.026561 | 6.528996 | 5.999120 | 6.558603 | 6.679002 | 0.993385 | 0.467255 | 162.318266 | 1.000000 |
8 rows × 64 columns
# Long-format copy of the data: the original row index becomes `rowid`, Y is
# kept as an identifier, and every other column is stacked into
# (`variable`, `value`) pairs.
lf = (
    df.reset_index()
    .rename(columns={'index': 'rowid'})
    .melt(id_vars=['rowid', 'Y'])
    .copy()
)
lf.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 14049 entries, 0 to 14048 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 rowid 14049 non-null int64 1 Y 14049 non-null int64 2 variable 14049 non-null object 3 value 14049 non-null float64 dtypes: float64(1), int64(2), object(1) memory usage: 439.2+ KB
df
| X01 | X02 | X03 | X04 | X05 | X06 | X07 | X08 | X09 | X10 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | V29 | Y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -2.907070 | 1.266914 | -0.332039 | -0.248782 | 0.200432 | -0.008683 | 0.316866 | -0.323924 | 0.030199 | -0.205569 | ... | 0.552065 | 0.489846 | 1.113175 | -0.240931 | -0.108875 | -0.114766 | 0.841632 | 0.104236 | -121.810994 | 1 |
| 1 | -4.608052 | 4.672474 | 0.154697 | 0.268719 | -0.842417 | 0.055191 | 0.622848 | -0.260097 | -0.651079 | 1.096821 | ... | 1.989505 | 1.355984 | 1.656029 | 2.428749 | 1.068637 | 1.945175 | 0.950544 | 0.143290 | -59.362086 | 1 |
| 2 | 4.338816 | 5.684974 | 1.868370 | -1.883006 | 0.589758 | 0.932240 | -0.646026 | 0.183410 | 0.132287 | -0.426386 | ... | 2.128248 | 2.553980 | 2.661607 | 2.625942 | 4.462401 | 3.621299 | 0.992381 | 0.127803 | -79.575912 | 1 |
| 3 | -1.835062 | 0.427501 | -2.226023 | 0.700375 | -1.144850 | 1.188100 | 0.727831 | -0.271734 | 0.003246 | 0.138308 | ... | -0.856860 | -0.766993 | -0.882442 | -0.832196 | -0.377106 | -0.633452 | 0.964183 | 0.088978 | -139.426151 | 0 |
| 4 | 13.990969 | -3.877269 | 1.921605 | 0.162288 | 2.316402 | -0.161137 | -0.099180 | 0.514620 | -0.551956 | -0.517779 | ... | -0.333291 | 0.126656 | -0.557930 | -0.185135 | -0.639549 | -0.245234 | 0.845817 | 0.172305 | -29.433234 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 218 | -3.454719 | -0.911378 | 1.072785 | 0.845129 | 1.817682 | -0.034388 | -0.394277 | -0.260803 | 0.018067 | -0.718457 | ... | -1.041533 | -0.585809 | -0.831599 | 0.315496 | -2.098443 | -0.767479 | 0.941532 | 0.112127 | -100.512718 | 0 |
| 219 | 0.493271 | 2.184699 | 0.107755 | -1.852857 | 0.329977 | 0.679100 | 0.933463 | 0.110013 | 0.234102 | -0.080677 | ... | 1.599908 | 1.485953 | 1.326105 | 0.738920 | 1.782694 | 1.221559 | 0.965555 | 0.098206 | -118.299085 | 1 |
| 220 | -5.009510 | -0.488819 | 0.838883 | 0.615571 | 0.069954 | -0.365945 | -0.290072 | 0.777958 | -0.392741 | 0.126245 | ... | -0.769891 | -0.384573 | -0.894474 | -1.071848 | -0.755456 | -1.001084 | 0.898706 | 0.106729 | -102.237684 | 1 |
| 221 | -2.665672 | -0.546497 | -0.545406 | -0.477273 | 1.476238 | -0.019403 | 0.280312 | 0.359992 | -0.136810 | 0.116397 | ... | -0.092393 | -0.962929 | -0.580874 | -0.750556 | -1.618888 | -1.083649 | 0.964139 | 0.096673 | -119.015441 | 1 |
| 222 | 0.055307 | -1.709225 | -0.763259 | -0.755982 | -0.106586 | -1.174575 | -0.608212 | -0.483285 | -0.055759 | -0.658711 | ... | 0.076693 | 0.434874 | 0.371620 | 0.947893 | 0.775466 | 0.472459 | 0.914727 | 0.137963 | -78.872248 | 1 |
223 rows × 64 columns
lf
| rowid | Y | variable | value | |
|---|---|---|---|---|
| 0 | 0 | 1 | X01 | -2.907070 |
| 1 | 1 | 1 | X01 | -4.608052 |
| 2 | 2 | 1 | X01 | 4.338816 |
| 3 | 3 | 0 | X01 | -1.835062 |
| 4 | 4 | 1 | X01 | 13.990969 |
| ... | ... | ... | ... | ... |
| 14044 | 218 | 0 | V29 | -100.512718 |
| 14045 | 219 | 1 | V29 | -118.299085 |
| 14046 | 220 | 1 | V29 | -102.237684 |
| 14047 | 221 | 1 | V29 | -119.015441 |
| 14048 | 222 | 1 | V29 | -78.872248 |
14049 rows × 4 columns
lf.variable.value_counts()
X01 223
V14 223
V01 223
V02 223
V03 223
...
Z02 223
Z03 223
Z04 223
Z05 223
V29 223
Name: variable, Length: 63, dtype: int64
# One histogram per feature, eight facets per row. Bins and axes are
# independent across facets because the features are on very different scales.
sns.displot(
    data=lf,
    x='value',
    col='variable',
    col_wrap=8,
    kind='hist',
    common_bins=False,
    common_norm=False,
    facet_kws={'sharex': False, 'sharey': False},
)
plt.show()
Several variables are clearly symmetric, e.g. X01, X02, X09, X12, Z02, Z04, V02, V04, V06 and V07.
Others are not: X10, X19 and X22 are bimodal, Z07 is left-skewed, and Z08 is right-skewed.
# Per-feature density estimate split by Y. common_norm=False normalizes each
# class separately, so the unequal class sizes do not distort the comparison.
sns.displot(
    data=lf,
    x='value',
    hue='Y',
    col='variable',
    col_wrap=8,
    kind='kde',
    common_norm=False,
    facet_kws={'sharex': False, 'sharey': False},
)
plt.show()
The KDE plots are usually similar across the two categories, but there are exceptions: for X19, for example, Y=1 is bimodal while Y=0 is unimodal. For the V and Z variables, the Y=1 density is concentrated further to the right than the Y=0 density.
# Bar chart of how many rows fall in each class of Y.
sns.catplot(
    data=df,
    x='Y',
    kind='count',
)
plt.show()
# Small subset for detailed plots: two columns from each of the X, Z and V
# families, plus the target Y.
df_part_features = df[['X01', 'X02', 'Z01', 'Z02', 'V01', 'V02', 'Y']].copy()

# Long format of the subset: one row per (rowid, variable) pair, with Y kept
# as an identifier. The earlier bare `melt(ignore_index=False)` call was
# dropped: its result was discarded and it stacked Y into the value column.
lf_part = (
    df_part_features.reset_index()
    .rename(columns={'index': 'rowid'})
    .melt(id_vars=['rowid', 'Y'])
)
lf_part
| rowid | Y | variable | value | |
|---|---|---|---|---|
| 0 | 0 | 1 | X01 | -2.907070 |
| 1 | 1 | 1 | X01 | -4.608052 |
| 2 | 2 | 1 | X01 | 4.338816 |
| 3 | 3 | 0 | X01 | -1.835062 |
| 4 | 4 | 1 | X01 | 13.990969 |
| ... | ... | ... | ... | ... |
| 1333 | 218 | 0 | V02 | 0.046036 |
| 1334 | 219 | 1 | V02 | 0.168402 |
| 1335 | 220 | 1 | V02 | -0.041844 |
| 1336 | 221 | 1 | V02 | -0.208668 |
| 1337 | 222 | 1 | V02 | -0.080534 |
1338 rows × 4 columns
# Working copy for the analysis: rows with missing values are dropped and the
# target Y is stored as a categorical column, while the features stay float.
df_clean = df.dropna().astype({'Y': 'category'})
df_clean.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 223 entries, 0 to 222 Data columns (total 64 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 X01 223 non-null float64 1 X02 223 non-null float64 2 X03 223 non-null float64 3 X04 223 non-null float64 4 X05 223 non-null float64 5 X06 223 non-null float64 6 X07 223 non-null float64 7 X08 223 non-null float64 8 X09 223 non-null float64 9 X10 223 non-null float64 10 X11 223 non-null float64 11 X12 223 non-null float64 12 X13 223 non-null float64 13 X14 223 non-null float64 14 X15 223 non-null float64 15 X16 223 non-null float64 16 X17 223 non-null float64 17 X18 223 non-null float64 18 X19 223 non-null float64 19 X20 223 non-null float64 20 X21 223 non-null float64 21 X22 223 non-null float64 22 X23 223 non-null float64 23 X24 223 non-null float64 24 X25 223 non-null float64 25 Z01 223 non-null float64 26 Z02 223 non-null float64 27 Z03 223 non-null float64 28 Z04 223 non-null float64 29 Z05 223 non-null float64 30 Z06 223 non-null float64 31 Z07 223 non-null float64 32 Z08 223 non-null float64 33 Z09 223 non-null float64 34 V01 223 non-null float64 35 V02 223 non-null float64 36 V03 223 non-null float64 37 V04 223 non-null float64 38 V05 223 non-null float64 39 V06 223 non-null float64 40 V07 223 non-null float64 41 V08 223 non-null float64 42 V09 223 non-null float64 43 V10 223 non-null float64 44 V11 223 non-null float64 45 V12 223 non-null float64 46 V13 223 non-null float64 47 V14 223 non-null float64 48 V15 223 non-null float64 49 V16 223 non-null float64 50 V17 223 non-null float64 51 V18 223 non-null float64 52 V19 223 non-null float64 53 V20 223 non-null float64 54 V21 223 non-null float64 55 V22 223 non-null float64 56 V23 223 non-null float64 57 V24 223 non-null float64 58 V25 223 non-null float64 59 V26 223 non-null float64 60 V27 223 non-null float64 61 V28 223 non-null float64 62 V29 223 non-null float64 63 Y 223 non-null category dtypes: category(1), float64(63) memory usage: 
110.2 KB
# Correlation heatmap across all numeric columns of df_clean. Y is categorical
# there, so numeric_only=True leaves it out of the matrix.
fig, ax = plt.subplots(figsize=(16, 16))
sns.heatmap(
    data=df_clean.corr(numeric_only=True),
    ax=ax,
    cmap='coolwarm',
    center=0,
    vmin=-1,
    vmax=1,
    cbar=False,
)
plt.show()
# Distinct values of Y as a plain list, in order of first appearance.
groups = df_clean['Y'].unique().tolist()
groups
[1, 0]
# Feature-by-feature correlation matrix computed separately for each class of
# Y; the result is indexed by (Y, feature).
# Y is categorical in df_clean, so observed=True is passed explicitly: only
# categories that actually occur are grouped, and the code does not rely on
# the deprecated default for categorical groupers.
corr_per_group = df_clean.groupby('Y', observed=True).corr()
corr_per_group
| X01 | X02 | X03 | X04 | X05 | X06 | X07 | X08 | X09 | X10 | ... | V20 | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | V29 | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Y | ||||||||||||||||||||||
| 0 | X01 | 1.000000 | -0.205255 | -0.087020 | 0.181176 | -0.106155 | 0.022670 | 0.203954 | -0.116990 | -0.117701 | -0.052425 | ... | 0.489908 | 0.472819 | 0.402452 | 0.469191 | 0.434905 | 0.385855 | 0.496872 | -0.214225 | -0.457191 | -0.407074 |
| X02 | -0.205255 | 1.000000 | -0.000509 | 0.068710 | 0.028603 | 0.046520 | -0.107031 | 0.021405 | -0.192939 | -0.014649 | ... | 0.624683 | 0.539181 | 0.587521 | 0.554541 | 0.491096 | 0.469262 | 0.493617 | -0.339179 | -0.212051 | -0.202974 | |
| X03 | -0.087020 | -0.000509 | 1.000000 | 0.101524 | 0.037781 | -0.004955 | -0.129379 | -0.099406 | 0.025789 | -0.081244 | ... | -0.003276 | 0.094116 | 0.134313 | 0.178055 | 0.048545 | 0.200277 | 0.143240 | -0.088684 | -0.077255 | -0.133750 | |
| X04 | 0.181176 | 0.068710 | 0.101524 | 1.000000 | -0.021093 | 0.114410 | 0.116193 | 0.093050 | 0.066907 | 0.004890 | ... | -0.064748 | -0.138889 | -0.228806 | -0.112803 | -0.163159 | -0.196220 | -0.180789 | -0.024765 | -0.147766 | -0.108528 | |
| X05 | -0.106155 | 0.028603 | 0.037781 | -0.021093 | 1.000000 | 0.011185 | -0.165383 | 0.146792 | 0.070581 | -0.049404 | ... | -0.045776 | -0.078500 | -0.180840 | -0.218230 | -0.173990 | -0.193015 | -0.213615 | 0.048804 | 0.070151 | 0.103119 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1 | V25 | 0.449221 | 0.535932 | 0.207738 | -0.319819 | -0.171985 | -0.109611 | -0.093405 | -0.094394 | 0.120891 | -0.064388 | ... | 0.728426 | 0.784733 | 0.757744 | 0.860391 | 0.773741 | 1.000000 | 0.887672 | 0.301572 | 0.384675 | 0.362597 |
| V26 | 0.509917 | 0.642393 | 0.209614 | -0.329812 | -0.177260 | -0.125603 | -0.028217 | -0.056534 | 0.093481 | -0.102301 | ... | 0.857264 | 0.900553 | 0.880138 | 0.942585 | 0.946392 | 0.887672 | 1.000000 | 0.298503 | 0.402619 | 0.366128 | |
| V27 | 0.020993 | 0.255429 | 0.191936 | -0.078613 | -0.123966 | -0.051715 | 0.009738 | 0.008055 | -0.026474 | 0.057121 | ... | 0.172014 | 0.282729 | 0.233793 | 0.286658 | 0.300219 | 0.301572 | 0.298503 | 1.000000 | -0.156757 | -0.175995 | |
| V28 | 0.481938 | 0.099421 | -0.056146 | -0.048289 | -0.303781 | 0.006422 | -0.132591 | 0.104939 | 0.089362 | -0.041137 | ... | 0.359102 | 0.299891 | 0.337032 | 0.430226 | 0.338856 | 0.384675 | 0.402619 | -0.156757 | 1.000000 | 0.961420 | |
| V29 | 0.487228 | 0.037701 | -0.036594 | -0.055237 | -0.241634 | 0.026896 | -0.116197 | 0.061452 | 0.045616 | -0.030766 | ... | 0.318734 | 0.264227 | 0.306602 | 0.374609 | 0.306797 | 0.362597 | 0.366128 | -0.175995 | 0.961420 | 1.000000 |
126 rows × 63 columns
Let's now study the correlations within each group of Y:
# One correlation heatmap per class of Y, stacked vertically with shared axes.
# squeeze=False makes plt.subplots return a 2-D array of axes even when there
# is a single group, so the loop below works for any number of groups.
fig, axs = plt.subplots(
    len(groups), 1, figsize=(18, 18), sharex=True, sharey=True, squeeze=False
)
for group_ax, group in zip(axs.ravel(), groups):
    # corr_per_group is indexed by (Y, feature); .loc[group] selects the
    # square correlation matrix of that class.
    sns.heatmap(
        data=corr_per_group.loc[group],
        vmin=-1, vmax=1, center=0,
        cmap='coolwarm', cbar=False,
        ax=group_ax,
    )
    group_ax.set_title(f'Y: {group}')
plt.show()
It is impossible to make detailed observations at this scale, but we can see some strong positive associations between consecutive V variables, and negative associations between V27, V28, V29 and most of the other V variables.
# Same six-feature subset as before, but without the target column.
subset_columns = ['X01', 'X02', 'Z01', 'Z02', 'V01', 'V02']
df_part_features1 = df.loc[:, subset_columns].copy()

# Pairwise scatter plots with the marginal distributions on the diagonal.
sns.pairplot(
    data=df_part_features1,
    diag_kws={'common_norm': False},
)
plt.show()
# Annotated correlation heatmap for the six-feature subset; every cell shows
# the coefficient to three decimals.
fig, ax = plt.subplots()
sns.heatmap(
    data=df_part_features1.corr(numeric_only=True),
    ax=ax,
    cmap='coolwarm',
    center=0,
    vmin=-1,
    vmax=1,
    annot=True,
    fmt='.3f',
    annot_kws={'size': 10},
)
plt.show()
There is a strong linear relationship within the pairs (X01, Z01), (X01, V01) and (Z01, V01), and almost no relationship between X01 and V02.
# Pair plot of the subset, colored by Y. common_norm=False normalizes each
# class's diagonal density separately.
sns.pairplot(
    data=df_part_features,
    hue='Y',
    diag_kws={'common_norm': False},
)
plt.show()
The relationships between pairs of continuous variables are more variable for Y=1 than for Y=0. It is also difficult to separate the categories of Y using these few features. That is not surprising, because the full data set has 63 feature variables. We will come back to this problem later, when we do the PCA analysis.
df_clean.shape
(223, 64)
# Violin plots (with quartile lines) of each subset feature by class of Y,
# laid out on a 2x3 grid. axs.flat walks the grid row by row, so the features
# land in the panels in the listed order.
fig, axs = plt.subplots(2, 3, figsize=(12, 6))
for panel, feature in zip(axs.flat, ['X01', 'X02', 'Z01', 'Z02', 'V01', 'V02']):
    sns.violinplot(data=df_clean, x='Y', y=feature, inner='quartile', ax=panel)
plt.show()
# Long format of the six subset features, built directly from df: the row
# index becomes `rowid`, Y is kept as an identifier, and only the listed
# columns are stacked into (`variable`, `value`) pairs.
lf_part = (
    df.reset_index()
    .rename(columns={'index': 'rowid'})
    .melt(
        id_vars=['rowid', 'Y'],
        value_vars=['X01', 'X02', 'Z01', 'Z02', 'V01', 'V02'],
    )
    .copy()
)
lf_part.columns
Index(['rowid', 'Y', 'variable', 'value'], dtype='object')
# Box plot of each subset feature by class of Y, three facets per row, each
# with its own y-axis.
sns.catplot(
    data=lf_part,
    x='Y',
    y='value',
    col='variable',
    col_wrap=3,
    kind='box',
    sharey=False,
)
plt.show()
# Per-class correlation matrices restricted to the six-feature subset.
# Y is categorical in df_clean, so observed=True is passed explicitly: only
# categories that actually occur are grouped, and the code does not rely on
# the deprecated default for categorical groupers.
corr_per_group1 = (
    df_clean.loc[:, ['X01', 'X02', 'Z01', 'Z02', 'V01', 'V02', 'Y']]
    .groupby('Y', observed=True)
    .corr()
)

# One heatmap per class of Y, side by side with shared axes. squeeze=False
# keeps the axes in a 2-D array even when there is only one group.
fig, axs = plt.subplots(
    1, len(groups), figsize=(18, 6), sharex=True, sharey=True, squeeze=False
)
for group_ax, group in zip(axs.ravel(), groups):
    # .loc[group] selects the square correlation matrix of that class.
    sns.heatmap(
        data=corr_per_group1.loc[group],
        vmin=-1, vmax=1, center=0,
        cmap='coolwarm', cbar=False,
        ax=group_ax,
    )
    group_ax.set_title(f'Y: {group}')
plt.show()